/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.byTask.feeds;

import java.io.IOException;
import java.nio.file.Path;
import java.util.Collections;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Parser for trec doc content, invoked on doc text excluding &lt;DOC&gt; and &lt;DOCNO&gt; which
 * are handled in TrecContentSource. Required to be stateless and hence thread safe.
 *
 * <p>Besides the abstract {@link #parse} contract, this class offers static helpers for mapping a
 * file path to a {@link ParsePathType}, stripping tags and extracting the text between two tags.
 */
// The static initializer below instantiates subtypes of this class while this class itself is
// still being initialized; that is the pattern the suppressed check warns about.
@SuppressWarnings("ClassInitializationDeadlock") // FIXME: may cause hangs!
public abstract class TrecDocParser {

  /** Types of trec parse paths. */
  public enum ParsePathType {
    GOV2,
    FBIS,
    FT,
    FR94,
    LATIMES
  }

  /** trec parser type used for unknown extensions */
  public static final ParsePathType DEFAULT_PATH_TYPE = ParsePathType.GOV2;

  /**
   * Matches one tag: a {@code <}, any run of characters other than {@code >}, and the closing
   * {@code >}. Compiled once so that {@link #stripTags(String, int)} does not recompile it on
   * every call. Declared ahead of the static initializers so it is already set while they run.
   */
  private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");

  /** Unmodifiable mapping from each path type to the parser instance registered for it. */
  static final Map<ParsePathType, TrecDocParser> pathType2Parser;

  static {
    pathType2Parser =
        Collections.unmodifiableMap(
            new EnumMap<>(
                Map.of(
                    ParsePathType.GOV2, new TrecGov2Parser(),
                    ParsePathType.FBIS, new TrecFBISParser(),
                    ParsePathType.FR94, new TrecFR94Parser(),
                    ParsePathType.FT, new TrecFTParser(),
                    ParsePathType.LATIMES, new TrecLATimesParser())));
  }

  /** Unmodifiable mapping from the upper-cased (ROOT locale) enum constant name to its type. */
  static final Map<String, ParsePathType> pathName2Type;

  static {
    Map<String, ParsePathType> name2Type = new HashMap<>();
    for (ParsePathType ppt : ParsePathType.values()) {
      name2Type.put(ppt.name().toUpperCase(Locale.ROOT), ppt);
    }
    pathName2Type = Collections.unmodifiableMap(name2Type);
  }

  /**
   * Bound on the walk up from a file to its ancestors when looking for a known path type: the
   * walk in {@link #pathType(Path)} inspects at most {@code MAX_PATH_LENGTH - 1} names (the file
   * itself included).
   */
  private static final int MAX_PATH_LENGTH = 10;

  /**
   * Compute the path type of a file by inspecting name of file and its parents.
   *
   * <p>Names are compared case-insensitively (upper-cased with the ROOT locale) against the names
   * of the {@link ParsePathType} constants, starting at {@code f} and moving towards the root.
   *
   * @param f file (or directory) to classify, may be null
   * @return the type of the first matching name found, or {@link #DEFAULT_PATH_TYPE} if none of
   *     the inspected names matches
   */
  public static ParsePathType pathType(Path f) {
    Path current = f;
    for (int visited = 1; current != null && visited < MAX_PATH_LENGTH; visited++) {
      Path fileName = current.getFileName();
      if (fileName == null) {
        break; // no name element left to inspect
      }
      ParsePathType ppt = pathName2Type.get(fileName.toString().toUpperCase(Locale.ROOT));
      if (ppt != null) {
        return ppt;
      }
      current = current.getParent();
    }
    return DEFAULT_PATH_TYPE;
  }

  /**
   * parse the text prepared in docBuf into a result DocData, no synchronization is required.
   *
   * @param docData reusable result
   * @param name name that should be set to the result
   * @param trecSrc calling trec content source
   * @param docBuf text to parse
   * @param pathType type of parsed file, or null if unknown - may be used by parsers to alter their
   *     behavior according to the file path type.
   */
  public abstract DocData parse(
      DocData docData,
      String name,
      TrecContentSource trecSrc,
      StringBuilder docBuf,
      ParsePathType pathType)
      throws IOException;

  /**
   * strip tags from <code>buf</code>: each tag is replaced by a single blank.
   *
   * @param buf text to strip tags from
   * @param start index in <code>buf</code> of the first character to keep; everything before it
   *     is dropped from the result
   * @return text obtained when stripping all tags from <code>buf</code> (Input StringBuilder is
   *     unmodified).
   */
  public static String stripTags(StringBuilder buf, int start) {
    return stripTags(buf.substring(start), 0);
  }

  /**
   * strip tags from input: each tag is replaced by a single blank. For example, {@code
   * stripTags("a<b>c", 0)} returns {@code "a c"}.
   *
   * @param buf text to strip tags from
   * @param start if &gt; 0, index in <code>buf</code> of the first character to keep
   * @return the text from <code>start</code> on, with every tag replaced by a blank
   * @see #stripTags(StringBuilder, int)
   */
  public static String stripTags(String buf, int start) {
    String text = start > 0 ? buf.substring(start) : buf;
    return TAG_PATTERN.matcher(text).replaceAll(" ");
  }

  /**
   * Extract from <code>buf</code> the text of interest within specified tags
   *
   * @param buf entire input text
   * @param startTag tag marking start of text of interest
   * @param endTag tag marking end of text of interest
   * @param maxPos if &ge; 0, both the start tag and the end tag must begin before this position;
   *     a negative value means no limit
   * @param noisePrefixes optional (may be null) strings to skip at the start of the text of
   *     interest: for each prefix, in array order, its first occurrence at or after the current
   *     start that also ends at or before the end tag moves the start to just past it
   * @return text of interest, trimmed, or null if not found
   */
  public static String extract(
      StringBuilder buf, String startTag, String endTag, int maxPos, String[] noisePrefixes) {
    int k1 = buf.indexOf(startTag);
    if (k1 < 0 || (maxPos >= 0 && k1 >= maxPos)) {
      return null; // start tag missing or outside the allowed range
    }
    k1 += startTag.length();
    int k2 = buf.indexOf(endTag, k1);
    if (k2 < 0 || (maxPos >= 0 && k2 >= maxPos)) {
      return null; // end tag missing or outside the allowed range
    }
    if (noisePrefixes != null) {
      for (String noise : noisePrefixes) {
        int k1a = buf.indexOf(noise, k1);
        int afterNoise = k1a + noise.length();
        // Skip the noise only if it lies entirely before the end tag; a prefix overlapping the
        // end tag would otherwise move the start past the end and make substring() throw.
        if (k1a >= 0 && afterNoise <= k2) {
          k1 = afterNoise;
        }
      }
    }
    return buf.substring(k1, k2).trim();
  }
}
